########################################################
## This code defines the functions that perform training and
## prediction for our three machine learning algorithms
########################################################

library(haven)
library(caret)
library(randomForest)
library(doParallel)
library(mice)
library(plyr)
library(dplyr)


## Training + predicting:
 
estimating <- function(data, 
                       dependent, 
                       parameters, 
                       s_tuning = 0.1, 
                       s_training = 0.3, 
                       firstvar = "Gender", 
                       seed = 2111,
                       noisily = FALSE) {
  
  start_time <- Sys.time()
  
  ## Set seed at the beginning every time:
  set.seed(seed, "L'Ecuyer-CMRG")
  
  ########################################################
  ## Define the sub-sample for creation of predictions
  ########################################################
  
  ## Keeping only 30% of sample for creation of predictions
  n_parametertuning <- round(nrow(data)*s_tuning, digits=0) + 1
  n_training <- round((nrow(data)*s_training), digits=0)
  n_training_plus_tuning <- n_training + n_parametertuning
  n_training_plus_tuning_1 <- n_training_plus_tuning + 1
  
  first_column <- which(colnames(data)==firstvar) ## Identify column number where the variables of interest start
  
  ## Keeping only those observations with non-missing outcome variables
  y_column <- which(colnames(data) == dependent) ## Identify column number with the dependent variable of interest
  data <- data[complete.cases(data[, y_column:y_column]), ] ## Restrict the dataset to observations with non-missing dependent variable
  
  ## Creating dataset for training and predictions
  training <- data[data$n_order >= n_parametertuning & data$n_order <= n_training_plus_tuning,] ## Data for training
  data_pred <- data[data$n_order >= n_training_plus_tuning_1,] ## Data for predictions (hold-out sample)
  
  training_y <- factor(training[[dependent]])
  training_x <- training[, first_column:ncol(training)] ## Keeping all the possible covariates we want to have in the model
  training_final <- as.data.frame(cbind(training_y, training_x)) ## Creating final dataset used for training
  
  total_y <- factor(data_pred[[dependent]])
  total_x <- data_pred[,first_column:ncol(data_pred)] ## Keeping all the possible covariates we want to have in the model
  total_final <- as.data.frame(cbind(total_y, total_x)) ## Creating final dataset used for predictions
  
  persinfo <- as.data.frame(cbind(data_pred$LopNr_PersonNr, data_pred$n)) ## Creating a dataset with individual ID and n for the predictions
  
  ## Correcting format of outcome variables
  levels(total_final$total_y) <- c("no", "yes")
  levels(training_final$training_y) <- c("no", "yes")
  
  ########################################################
  ## Making the predictions
  ########################################################    
  
  ## Predictions 1: Random Forest
  
  if (noisily) {
    print(paste0("Training Random Forest, ", Sys.time()))
  }
  
  rff_final <- ranger(training_y~., data = training_final, num.trees = 250,
                     mtry = parameters$rfgrid_final$mtry, min.node.size = parameters$rfgrid_final$min.node.size,
                     sample.fraction = 0.1, splitrule = "gini", save.memory = TRUE, probability = TRUE, importance='impurity')
  
  ## Predictions 2: Boosted Gradient
  
  if (noisily) {
    print(paste0("Training Boosted Gradient, ", Sys.time()))
  }
  
  ## Need to re-format data before feeding it to xgboost
  x_boost <- data.matrix(training_x)
  y_boost <- as.vector(training_y)
  
  rboost_final <- xgboost(data = x_boost, label = y_boost, 
                         eta = parameters$boostgrid_final$eta, nround = parameters$boostgrid_final$nrounds,
                         subsample = 0.1, objective = "binary:logistic", eval_metric = "auc")
  

  ## Predictions 3: LASSO
  
  if (noisily) {
    print(paste0("Training LASSO, ", Sys.time()))
  }
  
  ## Need to re-format data before feeding it to glmnet
  x_lasso = data.matrix(training_x)
  y_lasso = as.vector(training_y)
  
  rlasso_final <- glmnet(x = x_lasso, y = y_lasso, family = "binomial",
                        alpha = parameters$lassogrid_final$alpha, lambda = parameters$lassogrid_final$lambda,
                        subsample = 0.1)
  

  ########################################################
  ## Identify the importance of variables
  ########################################################    
  
  ## Random forest
  importanceColumn <-c(importance(rff_final))
  print(importanceColumn)

  ## Boosted Gradient
  v<-as.vector(xgb.importance(model=rboost_final)$Feature)
  w<-as.vector(xgb.importance(model=rboost_final)$Gain)
  DF<-as.data.frame(cbind(w,v))

  ## LASSO
  imp.lasso <- varImp(rlasso_final, lambda = parameters$lassogrid_final$lambda)
  
  importance <- list(rf = importanceColumn, rboost = DF, lasso = imp.lasso)
  
  ########################################################
  ## Save models
  ########################################################
  
  models <- list(rff_final = rff_final, 
                 rboost_final = rboost_final, 
                 rlasso_final = rlasso_final)
 
  ########################################################
  ## Create predictions
  ########################################################
  
  ## Creating predictions Random Forest
  prob_rf <- predict(rff_final, total_x)
  prob_rf <- prob_rf[["predictions"]]
  prob_rf <- data.frame(prob_rf[,2])
  
  ## Creating predictions Gradient Boost
  total_x_boost = data.matrix(total_x)
  prob_boost <- predict(rboost_final, total_x_boost)

  ## Creating predictions LASSO
  total_x_lasso = data.matrix(total_x)
  prob_lasso <- predict(rlasso_final, total_x_lasso, type = "response")

  ########################################################
  ## Save predictions
  ########################################################
  output <- cbind(total_y, prob_rf, prob_boost, prob_lasso, persinfo) ## put all the predictions together, with personal ID, n and outcome variable

  ## Return:
  return(list(importance = importance, models = models, output = output))

  
  if (noisily) {
    print(paste0("Training time: ", difftime(Sys.time(), start_time, units = c("mins")), "m"))
  }
}


## Here follows a simpler function that only returns the models (this is useful
## sometimes when we want to train the model on a particular sample, but we
## don't want to use the usual sample split procedure):

training.int <- function(data,
                         persinfo,
                         dependent, 
                         parameters, 
                         seed = NULL,
                         noisily = FALSE) {
  
  start_time <- Sys.time()
  
  ## Set seed at the beginning every time:
  if (!is.null(seed)) { 
    set.seed(seed, "L'Ecuyer-CMRG")
  }
  
  ########################################################
  ## Making the predictions
  ########################################################    
  
  ## Predictions 1: Random Forest
  
  if (noisily) {
    print(paste0("----> Training Random Forest, ", Sys.time()))
  }
  
  rff_final <- ranger(as.formula(paste0(dependent, " ~ .")), 
                      data = data, 
                      num.trees = 250,
                      mtry = parameters$rfgrid_final$mtry, 
                      min.node.size = parameters$rfgrid_final$min.node.size,
                      sample.fraction = 0.1, 
                      splitrule = "gini", 
                      save.memory = TRUE, 
                      probability = TRUE, 
                      importance='impurity')
  
  ## Predictions 2: Boosted Gradient
  
  if (noisily) {
    print(paste0("----> Training Boosted Gradient, ", Sys.time()))
  }
  
  ## Need to re-format data before feeding it to xgboost
  x_boost <- data.matrix(data[, setdiff(names(data), dependent)])
  y_boost <- c(0, 1)[as.numeric(data[[dependent]])]
  head(y_boost, n = 20)
  if (any(is.na(y_boost))) {
    return(y_boost)
    stop("THERE ARE NAs IN THE Y VECTOR")
  }
  
  rboost_final <- xgboost(data = x_boost, 
                          label = y_boost, 
                          eta = parameters$boostgrid_final$eta, 
                          nround = parameters$boostgrid_final$nrounds,
                          subsample = 0.1, 
                          objective = "binary:logistic", 
                          eval_metric = "auc",
                          verbose = 0)
  
  
  ## Predictions 3: LASSO
  
  if (noisily) {
    print(paste0("----> Training LASSO, ", Sys.time()))
  }
  
  ## Need to re-format data before feeding it to glmnet
  x_lasso <- data.matrix(data[, setdiff(names(data), dependent)])
  y_lasso <- c(0, 1)[as.numeric(data[[dependent]])]
  
  rlasso_final <- glmnet(x = x_lasso, 
                         y = y_lasso, 
                         family = "binomial",
                         alpha = parameters$lassogrid_final$alpha, 
                         lambda = parameters$lassogrid_final$lambda,
                         subsample = 0.1)
  
  ########################################################
  ## Return models
  ########################################################
  
  return(list(rff_final = rff_final, 
              rboost_final = rboost_final, 
              rlasso_final = rlasso_final))
  
  
  if (noisily) {
    print(paste0("Training time: ", difftime(Sys.time(), start_time, units = c("mins")), "m"))
  }
}